The tasks included in this report are:
# Load the measure metadata and keep only DDM-task measures.
measure_labels <- read.csv('/Users/zeynepenkavi/Dropbox/PoldrackLab/SRO_Retest_Analyses/input/measure_labels.csv')
# Drop the free-text description, restrict to DDM tasks, and split the dv
# string ("task.variable") into task_group and var, keeping the original dv.
measure_labels <- measure_labels %>%
  select(-measure_description) %>%
  filter(ddm_task == 1) %>%
  mutate(dv = as.character(dv)) %>%
  separate(dv, c("task_group", "var"), sep = "\\.", remove = FALSE, extra = "merge")
# Tasks covered by this report.
unique(measure_labels$task_group)
[1] "adaptive_n_back" "attention_network_task"
[3] "choice_reaction_time" "directed_forgetting"
[5] "dot_pattern_expectancy" "local_global_letter"
[7] "motor_selective_stop_signal" "recent_probes"
[9] "shape_matching" "simon"
[11] "stim_selective_stop_signal" "stop_signal"
[13] "stroop" "threebytwo"
adaptive_n_back
attention_network_task
choice_reaction_time
directed_forgetting
dot_pattern_expectancy
local_global_letter
motor_selective_stop_signal
recent_probes
shape_matching
simon
stim_selective_stop_signal
stop_signal
stroop
threebytwo
# Load t1 ("test") data; the X column holds subject identifiers.
test_data <- read.csv(paste0(retest_data_path,'t1_data/variables_exhaustive.csv'))
test_data_subs = as.character(test_data$X)
# Keep only the DDM-task dependent variables. one_of() makes the external
# character vector explicit to tidyselect; passing a bare vector to select()
# is ambiguous (it could be mistaken for a column of the same name).
test_data = test_data %>%
select(one_of(as.character(measure_labels$dv)))
test_data$sub_id = test_data_subs
rm(test_data_subs)
# Load t2 ("retest") data the same way.
retest_data <- read.csv(paste0(retest_data_path,'variables_exhaustive.csv'))
retest_data_subs = as.character(retest_data$X)
retest_data = retest_data %>%
select(one_of(as.character(measure_labels$dv)))
retest_data$sub_id = retest_data_subs
# Keep only retest subjects that also have t1 data.
retest_data = retest_data[retest_data$sub_id %in% test_data$sub_id,]
rm(retest_data_subs)
Since HDDM parameters depend on the sample on which they are fit we also refit the model on t1 data only for the subjects that have t2 data.
# HDDM parameters refit on t1 data using only the retest subsample (n=150).
hddm_refits <- read.csv(paste0(retest_data_path,'t1_data/hddm_refits_exhaustive.csv'))
# Preserve the full-sample fits before splicing in the refits.
test_data_hddm_fullfit <- test_data
# Columns of the test data that have no refit counterpart; these keep
# their full-sample values.
tmp = names(test_data_hddm_fullfit)[!(names(test_data_hddm_fullfit) %in% names(hddm_refits))]
hddm_refit_subs = as.character(hddm_refits$X)
# Restrict refits to the DDM dependent variables of interest.
# drop = FALSE guards against collapse to a vector if only one dv matched.
hddm_refits = hddm_refits[, names(hddm_refits) %in% measure_labels$dv, drop = FALSE]
hddm_refits$sub_id <- hddm_refit_subs
rm(hddm_refit_subs)
###RERAN MOTOR SS FOR REFIT (MISSING REACTIVE CONTROL HDDM DRIFT) - still missing; while missing the values from fitting to the full sample are used
# after correction this should just be sub_id
#tmp
# one_of() makes the external character vector explicit to tidyselect
# (a bare vector passed to select() is ambiguous).
test_data_hddm_refit = test_data_hddm_fullfit %>%
select(one_of(tmp))
#merge hddm refits to test data
test_data_hddm_refit = merge(test_data_hddm_refit, hddm_refits, by="sub_id")
rm(test_data, hddm_refits, tmp)
#for matching function to work properly assign name back to test_data
test_data = test_data_hddm_fullfit
# Dependent variables to compute retest statistics for (project helper).
numeric_cols = get_numeric_cols()
#Create df of point estimate reliabilities
rel_df_cols = c('icc', 'pearson', 'var_subs', 'var_ind', 'var_resid')
# Note: rel_df is built entirely inside the loop (the first iteration
# replaces it), so the previous placeholder data frame of NAs was dead
# code and is dropped. seq_along is safe for zero-length numeric_cols.
for(i in seq_along(numeric_cols)){
tmp = get_retest_stats(numeric_cols[i], metric = c('icc', 'pearson', 'var_breakdown'))
if(i == 1){
rel_df = tmp
} else {
rel_df = rbind(rel_df, tmp)
}
}
# Tag each row with the dv it describes, then reset row names to 1..n.
row.names(rel_df) <- numeric_cols
rel_df$dv = row.names(rel_df)
row.names(rel_df) = seq_len(nrow(rel_df))
Reassign df’s and clean up
# Keep the full-sample-fit reliabilities under a dedicated name and drop
# the temporaries used to build them.
rel_df_fullfit = rel_df
rm(test_data, rel_df, tmp, i, numeric_cols, rel_df_cols)
#for matching function to work properly assign name back to test_data
# Repeat the reliability computation, now with the refit HDDM parameters
# standing in as test_data (the matching helper reads that name).
test_data = test_data_hddm_refit
numeric_cols = get_numeric_cols()
#Create df of point estimate reliabilities
rel_df_cols = c('icc', 'pearson', 'var_subs', 'var_ind', 'var_resid')
# rel_df is built entirely inside the loop (the first iteration replaces
# it), so the previous placeholder data frame of NAs was dead code.
for(i in seq_along(numeric_cols)){
tmp = get_retest_stats(numeric_cols[i], metric = c('icc', 'pearson', 'var_breakdown'))
if(i == 1){
rel_df = tmp
} else {
rel_df = rbind(rel_df, tmp)
}
}
# Tag each row with the dv it describes, then reset row names to 1..n.
row.names(rel_df) <- numeric_cols
rel_df$dv = row.names(rel_df)
row.names(rel_df) = seq_len(nrow(rel_df))
Reassign df’s and clean up
# Keep the refit reliabilities under a dedicated name and drop temporaries.
rel_df_refit = rel_df
rm(test_data, rel_df, tmp, i, numeric_cols, rel_df_cols)
Extract ddm vars only
Using full sample fits’ reliabilities
# Bootstrapped reliabilities for the full-sample (n=552) HDDM fits.
fullfit_boot_df <- read.csv(gzfile(paste0(retest_data_path,'bootstrap_merged.csv.gz')))
Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
dec, : embedded nul(s) found in input
# process_boot_df is a project helper; see the project's utility scripts.
fullfit_boot_df = process_boot_df(fullfit_boot_df)
# Keep only the DDM-task dependent variables.
fullfit_boot_df = fullfit_boot_df[fullfit_boot_df$dv %in% measure_labels$dv,]
Refits bootstrapped reliabilities
# Bootstrapped reliabilities for the refit (n=150) HDDM parameters.
refit_boot_df = read.csv(gzfile(paste0(retest_data_path,'refits_bootstrap_merged.csv.gz')))
Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
dec, : embedded nul(s) found in input
refit_boot_df = process_boot_df(refit_boot_df)
Before going on with the rest of the report first decide on whether there are significant differences in the distributions of the HDDM parameter estimates and their reliabilities depending on whether they are fit on the full sample (n=552) or retest sample (n=150) for t1 data.
Differences in distributions: using scaled differences
# Stack the HDDM parameter estimates from the two t1 fits (full sample vs
# refit on the retest subsample), labeled by their source. one_of() makes
# the external vector explicit to tidyselect (a bare vector passed to
# select() is ambiguous).
hddm_pars_fullfit = test_data_hddm_fullfit %>%
select(sub_id, one_of(as.character(unique(refit_boot_df$dv)))) %>%
mutate(hddm_sample = "fullfit")
hddm_pars_refit = test_data_hddm_refit %>%
select(sub_id, one_of(as.character(unique(refit_boot_df$dv)))) %>%
mutate(hddm_sample = "refit")
hddm_pars = rbind(hddm_pars_fullfit, hddm_pars_refit)
rm(hddm_pars_fullfit, hddm_pars_refit)
# Per-subject, per-parameter difference (fullfit - refit), scaled within
# each dv so parameters on different scales become comparable.
hddm_pars = hddm_pars %>%
gather(dv, value, -sub_id, -hddm_sample) %>%
spread(hddm_sample, value) %>%
mutate(diff = fullfit - refit) %>%
group_by(dv) %>%
mutate(scaled_diff = scale(diff)) %>%
select(sub_id, dv, scaled_diff) %>%
left_join(measure_labels[,c("dv", "rt_acc")], by = "dv") %>%
na.omit()
# Density of the scaled fullfit-refit differences: per-parameter densities
# (colored fills) overlaid with the pooled density (black).
hddm_pars %>%
ggplot(aes(scaled_diff))+
geom_density(aes(fill = dv), alpha = 0.2, color = NA)+
geom_density(fill = "black", alpha = 1, color=NA)+
theme(legend.position = "none")+
xlab("Scaled difference of HDDM parameter estimate (full - refit)")
Does the distribution of scaled difference scores (between using n=150 or n=552) have a mean different than 0 allowing for random effects of parameter accounting for the different types of parameters? No.
# Is the mean scaled difference nonzero? Fixed effect of parameter type
# (rt_acc) plus a random intercept per parameter (dv).
summary(lmer(scaled_diff ~ rt_acc + (1|dv), hddm_pars))
Linear mixed model fit by REML ['lmerMod']
Formula: scaled_diff ~ rt_acc + (1 | dv)
Data: hddm_pars
REML criterion at convergence: 25057
Scaled residuals:
Min 1Q Median 3Q Max
-10.438 -0.501 -0.007 0.460 9.750
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 1.42e-33 3.77e-17
Residual 9.93e-01 9.97e-01
Number of obs: 8845, groups: dv, 62
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.90e-17 1.45e-02 0
rt_accnon-decision -3.92e-17 2.66e-02 0
rt_accthreshold 1.73e-20 2.60e-02 0
Correlation of Fixed Effects:
(Intr) rt_cc-
rt_ccnn-dcs -0.545
rt_ccthrshl -0.558 0.304
# Same result
# summary(MCMCglmm(scaled_diff ~ rt_acc, random = ~ dv, data=hddm_pars))
Are there differences in reliability depending which sample the HDDM parameters are estimated from? No.
# ICCs of the HDDM parameters from each of the two fitting samples.
ddm_dvs <- unique(refit_boot_df$dv)
hddm_rels_fullfit <- rel_df_fullfit %>%
  filter(dv %in% ddm_dvs) %>%
  select(dv, icc)
hddm_rels_refit <- rel_df_refit %>%
  filter(dv %in% ddm_dvs) %>%
  select(dv, icc)
# Side-by-side table of reliabilities per parameter (icc.x = fullfit,
# icc.y = refit after the join), with the parameter type attached.
hddm_rels <- hddm_rels_fullfit %>%
  left_join(hddm_rels_refit, by = "dv") %>%
  rename(fullfit = icc.x, refit = icc.y) %>%
  left_join(measure_labels[, c("dv", "rt_acc")], by = "dv")
rm(hddm_rels_fullfit, hddm_rels_refit, ddm_dvs)
# Scatter of fullfit vs refit reliabilities per parameter; points on the
# identity line indicate no effect of the fitting sample.
hddm_rels %>%
ggplot(aes(fullfit, refit, col=rt_acc))+
geom_point()+
geom_abline(slope=1, intercept = 0)+
xlab("Reliability of HDDM params using n=552")+
ylab("Reliability of HDDM params using n=150")+
theme(legend.title = element_blank())
# Long format (one row per dv x sample) for the mixed model below.
hddm_rels = hddm_rels %>%
gather(sample, value, -dv, -rt_acc)
# Does reliability depend on the fitting sample, parameter type, or their
# interaction? Random intercept per parameter (dv).
summary(lmer(value ~ sample*rt_acc + (1|dv), hddm_rels))
Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
$checkConv, : Model failed to converge with max|grad| = 0.127809 (tol =
0.002, component 1)
Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
- Rescale variables?
Linear mixed model fit by REML ['lmerMod']
Formula: value ~ sample * rt_acc + (1 | dv)
Data: hddm_rels
REML criterion at convergence: -1890
Scaled residuals:
Min 1Q Median 3Q Max
-4.55e-07 -3.43e-08 4.37e-08 9.14e-08 2.42e-07
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 1.98e-02 1.41e-01
Residual 7.80e-16 2.79e-08
Number of obs: 124, groups: dv, 62
Fixed effects:
Estimate Std. Error t value
(Intercept) 4.74e-01 2.32e-02 20.41
samplerefit -8.61e-16 6.88e-09 0.00
rt_accnon-decision -5.48e-02 4.42e-02 -1.24
rt_accthreshold 1.98e-02 4.32e-02 0.46
samplerefit:rt_accnon-decision 9.88e-16 1.26e-08 0.00
samplerefit:rt_accthreshold 1.22e-15 1.23e-08 0.00
Correlation of Fixed Effects:
(Intr) smplrf rt_cc- rt_cct smp:_-
samplerefit 0.000
rt_ccnn-dcs -0.525 0.000
rt_ccthrshl -0.538 0.000 0.283
smplrft:r_- 0.000 -0.546 0.000 0.000
smplrft:rt_ 0.000 -0.559 0.000 0.000 0.305
convergence code: 0
Model failed to converge with max|grad| = 0.127809 (tol = 0.002, component 1)
Model is nearly unidentifiable: very large eigenvalue
- Rescale variables?
You should compare model fits using either sample. That would tell you which estimates explain observed data better though given the lack of difference in parameter estimates I don’t expect these to differ either.
Conclusion: Going to use parameter estimates from refits to retest sample only for t1 data in the rest of this report because I think they are more comparable to the parameter estimates from t2.
Clean up
rm(hddm_pars, hddm_rels)
# For DDM parameters use the refit bootstrap reliabilities; keep the
# full-sample bootstrap rows only for dvs without a refit counterpart.
boot_df <- fullfit_boot_df %>%
  filter(!(dv %in% unique(refit_boot_df$dv)))
boot_df <- rbind(boot_df, refit_boot_df)
rm(fullfit_boot_df, refit_boot_df)
# From here on the refit versions are the canonical data frames.
rel_df <- rel_df_refit
rm(rel_df_fullfit, rel_df_refit)
test_data <- test_data_hddm_refit
rm(test_data_hddm_fullfit, test_data_hddm_refit)
Data wrangling df’s containing point estimate reliabilities and bootstrapped reliabilities
# Attach measure metadata and derive the two factors used throughout:
# ddm_raw ("raw" vs any DDM fit) and contrast (difference score or not).
label_cols <- c("dv", "task_group", "overall_difference", "raw_fit", "rt_acc")
rel_df <- rel_df %>%
  select(dv, icc, var_subs, var_ind, var_resid) %>%
  left_join(measure_labels[, label_cols], by = "dv") %>%
  mutate(ddm_raw = ifelse(raw_fit == "raw", "raw", "ddm"),
         contrast = ifelse(overall_difference == "difference", "contrast", "non-contrast"))
boot_df <- boot_df %>%
  select(dv, icc, var_subs, var_ind, var_resid) %>%
  left_join(measure_labels[, label_cols], by = "dv") %>%
  mutate(ddm_raw = ifelse(raw_fit == "raw", "raw", "ddm"),
         contrast = ifelse(overall_difference == "difference", "contrast", "non-contrast"))
rm(label_cols)
Plot reliability point estimates comparing DDM measures to raw measures faceting for contrast measures.
# Boxplots of point-estimate reliabilities: raw vs EZ vs hierarchical DDM
# measures, colored by measure type, faceted by contrast/non-contrast.
ddm_point_plot = rel_df %>%
mutate(rt_acc = as.character(rt_acc)) %>%
filter(rt_acc != "other") %>%
ggplot(aes(factor(raw_fit, levels = c("raw", "EZ", "hddm"), labels=c("Raw", "EZ-diffusion", "Hierarchical diffusion")), icc, fill=factor(rt_acc, levels = c("rt","accuracy", "drift rate", "threshold", "non-decision"), labels=c("Response Time", "Accuracy","Drift Rate", "Threshold", "Non-decision"))))+
geom_boxplot()+
facet_wrap(~factor(contrast, levels=c("non-contrast", "contrast"), labels=c("Non-contrast", "Contrast")))+
ylab("ICC")+
xlab("")+
theme(legend.title = element_blank(),
legend.position = 'bottom')+
guides(fill = guide_legend(ncol = 2, byrow=F))
# Extract the legend grob so its keys can be repositioned by hand.
mylegend<-g_legend(ddm_point_plot)
grob_name <- names(mylegend$grobs)[1]
#manually fix the legend
# NOTE(review): the layout row indices below are tied to this exact legend
# (5 keys in 2 columns) and will silently misplace entries if the fill
# scale changes — confirm after any change to the legend content.
#move non-decision down
#key
mylegend$grobs[grob_name][[1]]$layout[11,c(1:4)] <- c(4,8,4,8)
mylegend$grobs[grob_name][[1]]$layout[12,c(1:4)] <- c(4,8,4,8)
#text
mylegend$grobs[grob_name][[1]]$layout[17,c(1:4)] <- c(4,10,4,10)
#move threshold down
#key
mylegend$grobs[grob_name][[1]]$layout[9,c(1:4)] <- c(3,8,3,8)
mylegend$grobs[grob_name][[1]]$layout[10,c(1:4)] <- c(3,8,3,8)
#text
mylegend$grobs[grob_name][[1]]$layout[16,c(1:4)] <- c(3,10,3,10)
#move drift rate right and up
#key
mylegend$grobs[grob_name][[1]]$layout[7,c(1:4)] <- c(2,8,2,8)
mylegend$grobs[grob_name][[1]]$layout[8,c(1:4)] <- c(2,8,2,8)
#text
mylegend$grobs[grob_name][[1]]$layout[15,c(1:4)] <- c(2,10,2,10)
# Redraw the plot without its legend, with the hand-tweaked legend below.
grid.arrange(ddm_point_plot +theme(legend.position="none"),
mylegend, nrow=2, heights=c(10, 1))
rm(mylegend, ddm_point_plot)
Plot averaged bootstrapped reliability estimates per measure comparing DDM measures to raw measures faceting for contrast measures.
# Same plot as above but using the mean of the bootstrapped reliabilities
# per measure instead of the point estimates.
ddm_boot_plot = boot_df %>%
group_by(dv) %>%
summarise(mean_icc = mean(icc),
rt_acc = unique(rt_acc),
overall_difference = unique(overall_difference),
raw_fit = unique(raw_fit),
contrast = unique(contrast)) %>%
mutate(rt_acc = as.character(rt_acc)) %>%
filter(rt_acc != "other") %>%
ggplot(aes(factor(raw_fit, levels = c("raw", "EZ", "hddm"), labels=c("Raw", "EZ-diffusion", "Hierarchical diffusion")), mean_icc, fill=factor(rt_acc, levels = c("rt","accuracy", "drift rate", "threshold", "non-decision"), labels=c("Response Time", "Accuracy","Drift Rate", "Threshold", "Non-decision"))))+
geom_boxplot()+
facet_wrap(~factor(contrast, levels=c("non-contrast", "contrast"), labels=c("Non-contrast", "Contrast")))+
ylab("ICC")+
xlab("")+
theme(legend.title = element_blank(),
legend.position = 'bottom')+
guides(fill = guide_legend(ncol = 2, byrow=F))
# Extract the legend grob so its keys can be repositioned by hand.
mylegend<-g_legend(ddm_boot_plot)
grob_name <- names(mylegend$grobs)[1]
#manually fix the legend
# NOTE(review): these layout indices assume the same 5-key, 2-column
# legend as above — confirm if the fill scale ever changes.
#move non-decision down
#key
mylegend$grobs[grob_name][[1]]$layout[11,c(1:4)] <- c(4,8,4,8)
mylegend$grobs[grob_name][[1]]$layout[12,c(1:4)] <- c(4,8,4,8)
#text
mylegend$grobs[grob_name][[1]]$layout[17,c(1:4)] <- c(4,10,4,10)
#move threshold down
#key
mylegend$grobs[grob_name][[1]]$layout[9,c(1:4)] <- c(3,8,3,8)
mylegend$grobs[grob_name][[1]]$layout[10,c(1:4)] <- c(3,8,3,8)
#text
mylegend$grobs[grob_name][[1]]$layout[16,c(1:4)] <- c(3,10,3,10)
#move drift rate right and up
#key
mylegend$grobs[grob_name][[1]]$layout[7,c(1:4)] <- c(2,8,2,8)
mylegend$grobs[grob_name][[1]]$layout[8,c(1:4)] <- c(2,8,2,8)
#text
mylegend$grobs[grob_name][[1]]$layout[15,c(1:4)] <- c(2,10,2,10)
# Redraw the plot without its legend, with the hand-tweaked legend below.
grid.arrange(ddm_boot_plot +theme(legend.position="none"),
mylegend, nrow=2, heights=c(10, 1))
rm(mylegend, ddm_boot_plot)
Model testing if the reliability of raw measures differs from that of ddm estimates and if contrast measures differ from non-contrast measures.
Checking if both fixed effects of raw vs ddm and contrast vs non-contrast as well as their interaction is necessary.
Conclusion: Model with fixed effect just for contrast and not for raw vs ddm and no interaction is the best. But to answer the question on lack of difference for ddm vs raw I’ll look at the model with both fixed effects for now.
# Candidate mixed models for ICC with a random intercept per dv:
# raw-vs-ddm only (mer1), contrast only (mer1a), both main effects (mer2),
# and their interaction (mer3). "other"-type measures are excluded.
mer1 = lmer(icc ~ ddm_raw + (1|dv), boot_df %>% filter(rt_acc != "other"))
mer1a = lmer(icc ~ contrast + (1|dv), boot_df %>% filter(rt_acc != "other"))
mer2 = lmer(icc ~ ddm_raw + contrast + (1|dv), boot_df %>% filter(rt_acc != "other"))
mer3 = lmer(icc ~ ddm_raw * contrast + (1|dv), boot_df %>% filter(rt_acc != "other"))
# Likelihood-ratio comparisons (lme4 refits with ML for these tests).
anova(mer1, mer2, mer3)
refitting model(s) with ML (instead of REML)
anova(mer1a, mer2, mer3)
refitting model(s) with ML (instead of REML)
# Keep only mer2, which is inspected below.
rm(mer1, mer3, mer1a)
Raw measures do not significantly differ from ddm parameters in their reliability but non-contrast measures are significantly more reliable compared to contrast measures.
# Additive model for ICC retained from the comparisons above.
summary(mer2)
Linear mixed model fit by REML ['lmerMod']
Formula: icc ~ ddm_raw + contrast + (1 | dv)
Data: boot_df %>% filter(rt_acc != "other")
REML criterion at convergence: -497522
Scaled residuals:
Min 1Q Median 3Q Max
-11.647 -0.472 0.041 0.525 6.220
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 0.03484 0.1866
Residual 0.00901 0.0949
Number of obs: 267000, groups: dv, 267
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.2359 0.0197 11.95
ddm_rawraw 0.0262 0.0233 1.12
contrastnon-contrast 0.3529 0.0234 15.07
Correlation of Fixed Effects:
(Intr) ddm_rw
ddm_rawraw -0.415
cntrstnn-cn -0.654 -0.107
What is the best measure of individual difference for any measure that has both raw and DDM parameters?
Even though overall the ddm parameters are not significantly less reliable the most reliable measure is more frequently a raw measure. There are some examples of an EZ estimate being the best for a task as well. Regardless of raw vs ddm the best measure is always a non-contrast measure.
# Highest-ICC measure within each task (exact max works here because the
# compared values come from the same vector).
rel_df %>%
group_by(task_group) %>%
filter(icc == max(icc)) %>%
select(task_group, everything())
Converting variance estimates for each measure to percentage of all variance for comparability.
# Express each variance component as a percentage of the total
# (subjects + individual + residual) so components are comparable across
# measures with different scales.
rel_df = rel_df %>%
mutate(var_subs_pct = (var_subs/(var_subs+var_ind+var_resid))*100,
var_ind_pct = (var_ind/(var_subs+var_ind+var_resid))*100,
var_resid_pct = (var_resid/(var_subs+var_ind+var_resid))*100)
boot_df = boot_df %>%
mutate(var_subs_pct = (var_subs/(var_subs+var_ind+var_resid))*100,
var_ind_pct = (var_ind/(var_subs+var_ind+var_resid))*100,
var_resid_pct = (var_resid/(var_subs+var_ind+var_resid))*100)
# Mean (+/- SEM) percentage of between-subjects variance by measure type,
# faceted by contrast/non-contrast. sem() is a project helper.
rel_df %>%
filter(rt_acc != "other") %>%
group_by(rt_acc, contrast, raw_fit) %>%
summarise(mean_var_subs_pct = mean(var_subs_pct),
mean_var_ind_pct = mean(var_ind_pct),
mean_var_resid_pct = mean(var_resid_pct),
sem_var_subs_pct = sem(var_subs_pct),
sem_var_ind_pct = sem(var_ind_pct),
sem_var_resid_pct = sem(var_resid_pct)) %>%
ggplot(aes(factor(raw_fit, levels = c("raw", "EZ", "hddm"), labels=c("Raw", "EZ-diffusion", "Hierarchical diffusion")), mean_var_subs_pct, shape=factor(rt_acc, levels = c("rt","accuracy", "drift rate", "threshold", "non-decision"), labels=c("Response Time", "Accuracy","Drift Rate", "Threshold", "Non-decision"))))+
geom_point(position=position_dodge(width=0.75), size = 5)+
geom_errorbar(aes(ymin = mean_var_subs_pct - sem_var_subs_pct, ymax = mean_var_subs_pct + sem_var_subs_pct), position=position_dodge(width=0.75))+
facet_wrap(~factor(contrast, levels=c("non-contrast", "contrast"), labels=c("Non-contrast", "Contrast")))+
ylab("% of between subjects variance")+
xlab("")+
theme(legend.title = element_blank(),
legend.position = 'bottom')
# Same summary as above, now plotting the residual-variance percentage.
rel_df %>%
filter(rt_acc != "other") %>%
group_by(rt_acc, contrast, raw_fit) %>%
summarise(mean_var_subs_pct = mean(var_subs_pct),
mean_var_ind_pct = mean(var_ind_pct),
mean_var_resid_pct = mean(var_resid_pct),
sem_var_subs_pct = sem(var_subs_pct),
sem_var_ind_pct = sem(var_ind_pct),
sem_var_resid_pct = sem(var_resid_pct)) %>%
ggplot(aes(factor(raw_fit, levels = c("raw", "EZ", "hddm"), labels=c("Raw", "EZ-diffusion", "Hierarchical diffusion")), mean_var_resid_pct, shape=factor(rt_acc, levels = c("rt","accuracy", "drift rate", "threshold", "non-decision"), labels=c("Response Time", "Accuracy","Drift Rate", "Threshold", "Non-decision"))))+
geom_point(position=position_dodge(width=0.75), size = 5)+
geom_errorbar(aes(ymin = mean_var_resid_pct - sem_var_resid_pct, ymax = mean_var_resid_pct + sem_var_resid_pct), position=position_dodge(width=0.75))+
facet_wrap(~factor(contrast, levels=c("non-contrast", "contrast"), labels=c("Non-contrast", "Contrast")))+
ylab("% of residual variance")+
xlab("")+
theme(legend.title = element_blank(),
legend.position = 'bottom')
Model testing if the percentage of between subjects variance of raw measures differs from that of ddm estimates and if contrast measures differ from non-contrast measures.
Checking if both fixed effects of raw vs ddm and contrast vs non-contrast as well as their interaction is necessary.
Conclusion: Model with fixed effects for both is best.
# Candidate mixed models for % between-subjects variance, random
# intercept per dv; "other"-type measures are excluded.
mer1 = lmer(var_subs_pct ~ ddm_raw + (1|dv), boot_df %>% filter(rt_acc != "other"))
mer1a = lmer(var_subs_pct ~ contrast + (1|dv), boot_df %>% filter(rt_acc != "other"))
mer2 = lmer(var_subs_pct ~ ddm_raw + contrast + (1|dv), boot_df %>% filter(rt_acc != "other"))
mer3 = lmer(var_subs_pct ~ ddm_raw * contrast + (1|dv), boot_df %>% filter(rt_acc != "other"))
# Likelihood-ratio comparisons (lme4 refits with ML for these tests).
anova(mer1, mer2, mer3)
refitting model(s) with ML (instead of REML)
anova(mer1a, mer2, mer3)
refitting model(s) with ML (instead of REML)
# Keep only mer2, which is inspected below.
rm(mer1, mer1a, mer3)
Raw and non-contrast measures have higher between subjects variance (ie are better individual difference measures)
# Additive model for % between-subjects variance retained above.
summary(mer2)
Linear mixed model fit by REML ['lmerMod']
Formula: var_subs_pct ~ ddm_raw + contrast + (1 | dv)
Data: boot_df %>% filter(rt_acc != "other")
REML criterion at convergence: 2002344
Scaled residuals:
Min 1Q Median 3Q Max
-4.126 -0.659 0.076 0.674 4.702
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 171 13.1
Residual 105 10.2
Number of obs: 267000, groups: dv, 267
Fixed effects:
Estimate Std. Error t value
(Intercept) 41.64 1.39 30.05
ddm_rawraw 4.65 1.64 2.84
contrastnon-contrast 9.54 1.64 5.81
Correlation of Fixed Effects:
(Intr) ddm_rw
ddm_rawraw -0.415
cntrstnn-cn -0.654 -0.107
Model testing if the percentage of residual variance of raw measures differs from that of ddm estimates and if contrast measures differ from non-contrast measures.
Checking if both fixed effects of raw vs ddm and contrast vs non-contrast as well as their interaction is necessary.
Conclusion: Model with fixed effect just for contrast is best. And plot above already shows that contrast measures have higher residual variance.
# Candidate mixed models for % residual variance; only the anova
# comparisons are of interest here, so all four models are removed after.
mer1 = lmer(var_resid_pct ~ ddm_raw + (1|dv), boot_df %>% filter(rt_acc != "other"))
mer1a = lmer(var_resid_pct ~ contrast + (1|dv), boot_df %>% filter(rt_acc != "other"))
mer2 = lmer(var_resid_pct ~ ddm_raw + contrast + (1|dv), boot_df %>% filter(rt_acc != "other"))
mer3 = lmer(var_resid_pct ~ ddm_raw * contrast + (1|dv), boot_df %>% filter(rt_acc != "other"))
anova(mer1, mer2, mer3)
refitting model(s) with ML (instead of REML)
anova(mer1a, mer2, mer3)
refitting model(s) with ML (instead of REML)
rm(mer1, mer1a, mer2, mer3)
Differences in DDM parameter reliability for t1 data using either n=552 or n=150 were reported above under T1 HDDM parameters. No meaningful differences exist between these two sample sizes.
But even 150 is a large sample size for psychological studies, especially for the forced-choice reaction time tasks included in this report. Here we look at how the reliability of raw and ddm measures changes for sample sizes that are more common in studies using these tasks (25, 50, 75, 100, 125, 150).
Note: Not refitting HDDM’s for each of these sample sizes since a. there were no differences in parameter stability for n=150 vs 552 and b. a more comprehensive comparison using non-hierarchical estimates and model fit indices will follow. [revisit this - 150 and 552 might be too large to lead to changes in parameter estimates but smaller samples that are more common in psych studies might sway estimates more]
# Bootstrapped reliabilities computed at several sub-sample sizes.
rel_df_sample_size = read.csv(gzfile(paste0(input_path, 'rel_df_sample_size.csv.gz')))
#Check if all vars are there
# numeric_cols[which(numeric_cols %in% unique(rel_df_sample_size$dv) == FALSE)]
# Coerce dv to character before joining: measure_labels$dv is already
# character, and joining a factor to a character column otherwise triggers
# dplyr's coercion warning (the join result is the same either way).
rel_df_sample_size = rel_df_sample_size %>%
mutate(dv = as.character(dv)) %>%
left_join(measure_labels[,c("dv", "task_group","overall_difference","raw_fit","rt_acc")], by = "dv") %>%
mutate(ddm_raw = ifelse(raw_fit == "raw", "raw", "ddm"),
contrast = ifelse(overall_difference == "difference", "contrast", "non-contrast"),
contrast = factor(contrast, levels=c("non-contrast", "contrast")),
var_subs_pct = (var_subs/(var_subs+var_ind+var_resid))*100,
var_ind_pct = (var_ind/(var_subs+var_ind+var_resid))*100,
var_resid_pct = (var_resid/(var_subs+var_ind+var_resid))*100)
Warning: Column `dv` joining factor and character vector, coercing into
character vector
# Per-dv summaries of the bootstrapped reliabilities and variance
# components at each sample size. sem() is a project helper.
rel_df_sample_size_summary = rel_df_sample_size %>%
group_by(dv, sample_size, ddm_raw, contrast) %>%
summarise(mean_icc = mean(icc),
sem_icc = sem(icc),
mean_var_subs_pct = mean(var_subs_pct),
sem_var_subs_pct = sem(var_subs_pct),
mean_var_ind_pct = mean(var_ind_pct),
sem_var_ind_pct = sem(var_ind_pct),
mean_var_resid_pct = mean(var_resid_pct),
sem_var_resid_pct = sem(var_resid_pct))
# Grand summaries per sample size (pooled across dvs), used for the red
# overall trend lines in the plots below.
tmp = rel_df_sample_size %>%
group_by(sample_size) %>%
summarise(mean_icc = mean(icc),
sem_icc = sem(icc),
mean_var_subs_pct = mean(var_subs_pct),
sem_var_subs_pct = sem(var_subs_pct),
mean_var_ind_pct = mean(var_ind_pct),
sem_var_ind_pct = sem(var_ind_pct),
mean_var_resid_pct = mean(var_resid_pct),
sem_var_resid_pct = sem(var_resid_pct))
Does the mean reliability change with sample size?
Yes. The larger the sample size, the more reliable a given measure is on average. The largest increase in reliability occurs when moving from 25 to 50 subjects. This is important because many studies using these measures have sample sizes <50 per group.
# Per-dv reliability trajectories (grey) with the overall mean +/- SEM
# across dvs (red).
rel_df_sample_size_summary %>%
ggplot(aes(sample_size, mean_icc))+
geom_line(aes(group = dv), alpha = 0.1)+
geom_line(data = tmp,aes(sample_size,mean_icc), color="red")+
geom_point(data = tmp,aes(sample_size,mean_icc), color="red")+
geom_errorbar(data = tmp,aes(ymin=mean_icc-sem_icc, ymax = mean_icc+sem_icc), color="red", width = 0.1)+
ylab("Mean reliability of 100 samples of size n")+
xlab("Sample size")
# Effect of sample size on ICC with random intercepts for dv and for
# bootstrap iteration; the baseline level is the smallest size (n=25).
summary(lmer(icc ~ factor(sample_size) + (1|dv) + (1|iteration), rel_df_sample_size))
Linear mixed model fit by REML ['lmerMod']
Formula: icc ~ factor(sample_size) + (1 | dv) + (1 | iteration)
Data: rel_df_sample_size
REML criterion at convergence: -84292
Scaled residuals:
Min 1Q Median 3Q Max
-44.35 -0.30 0.03 0.41 5.37
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 0.07084 0.2661
iteration (Intercept) 0.00016 0.0127
Residual 0.03108 0.1763
Number of obs: 136500, groups: dv, 273; iteration, 100
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.40877 0.01619 25.2
factor(sample_size)50 0.02702 0.00151 17.9
factor(sample_size)75 0.03640 0.00151 24.1
factor(sample_size)100 0.04131 0.00151 27.4
factor(sample_size)125 0.04358 0.00151 28.9
Correlation of Fixed Effects:
(Intr) f(_)50 f(_)75 f(_)10
fctr(sm_)50 -0.047
fctr(sm_)75 -0.047 0.500
fctr(s_)100 -0.047 0.500 0.500
fctr(s_)125 -0.047 0.500 0.500 0.500
Does the change in reliability with sample size vary by variable type?
The changes do not differ by raw vs. ddm measures. It does differ by whether the measure is a contrast measure: For contrast measures all increases in reliability with sample size are larger than the increases for non-contrast measures. This implies that larger sample sizes are even more crucial for studies using contrast measures as trait-level measures.
# Summaries by sample size x measure type (overwrites the pooled tmp).
tmp = rel_df_sample_size %>%
group_by(sample_size, ddm_raw, contrast) %>%
summarise(mean_icc = mean(icc),
sem_icc = sem(icc),
mean_var_subs_pct = mean(var_subs_pct),
sem_var_subs_pct = sem(var_subs_pct),
mean_var_ind_pct = mean(var_ind_pct),
sem_var_ind_pct = sem(var_ind_pct),
mean_var_resid_pct = mean(var_resid_pct),
sem_var_resid_pct = sem(var_resid_pct))
# Reliability by sample size, split by raw vs ddm, faceted by contrast.
# NOTE(review): the error bars are hard-coded red while the lines/points
# they accompany are colored by ddm_raw — confirm this is intentional.
rel_df_sample_size_summary %>%
ggplot(aes(sample_size, mean_icc))+
geom_line(aes(group = dv, color=ddm_raw), alpha = 0.1)+
geom_line(data = tmp, aes(sample_size,mean_icc, color=ddm_raw))+
geom_point(data = tmp, aes(sample_size,mean_icc, color=ddm_raw))+
geom_errorbar(data = tmp,aes(ymin=mean_icc-sem_icc, ymax = mean_icc+sem_icc), color="red", width = 0.1)+
facet_wrap(~contrast)+
ylab("Mean reliability of 100 samples of size n")+
xlab("Sample size")+
theme(legend.title = element_blank(),
legend.position = "bottom")
# Does the sample-size effect on ICC differ for contrast measures?
summary(lmer(icc ~ factor(sample_size) * contrast + (1|dv) + (1|iteration), rel_df_sample_size))
Linear mixed model fit by REML ['lmerMod']
Formula: icc ~ factor(sample_size) * contrast + (1 | dv) + (1 | iteration)
Data: rel_df_sample_size
REML criterion at convergence: -84761
Scaled residuals:
Min 1Q Median 3Q Max
-44.30 -0.30 0.04 0.40 5.48
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 0.03727 0.1931
iteration (Intercept) 0.00016 0.0127
Residual 0.03100 0.1761
Number of obs: 136500, groups: dv, 273; iteration, 100
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.57368 0.01528 37.5
factor(sample_size)50 0.01585 0.00196 8.1
factor(sample_size)75 0.01985 0.00196 10.1
factor(sample_size)100 0.02295 0.00196 11.7
factor(sample_size)125 0.02346 0.00196 12.0
contrastcontrast -0.40560 0.02389 -17.0
factor(sample_size)50:contrastcontrast 0.02748 0.00307 9.0
factor(sample_size)75:contrastcontrast 0.04072 0.00307 13.3
factor(sample_size)100:contrastcontrast 0.04515 0.00307 14.7
factor(sample_size)125:contrastcontrast 0.04948 0.00307 16.1
Correlation of Fixed Effects:
(Intr) fc(_)50 fc(_)75 fc(_)100 fc(_)125 cntrst f(_)50:
fctr(sm_)50 -0.064
fctr(sm_)75 -0.064 0.500
fctr(s_)100 -0.064 0.500 0.500
fctr(s_)125 -0.064 0.500 0.500 0.500
cntrstcntrs -0.635 0.041 0.041 0.041 0.041
fctr(s_)50: 0.041 -0.638 -0.319 -0.319 -0.319 -0.064
fctr(s_)75: 0.041 -0.319 -0.638 -0.319 -0.319 -0.064 0.500
fctr(_)100: 0.041 -0.319 -0.319 -0.638 -0.319 -0.064 0.500
fctr(_)125: 0.041 -0.319 -0.319 -0.319 -0.638 -0.064 0.500
f(_)75: f(_)100:
fctr(sm_)50
fctr(sm_)75
fctr(s_)100
fctr(s_)125
cntrstcntrs
fctr(s_)50:
fctr(s_)75:
fctr(_)100: 0.500
fctr(_)125: 0.500 0.500
Does variability of reliability change with sample size?
Yes. Variability of reliability decreases with sample size. It does so more strongly for contrast measures.
# Variability of reliability (SEM across the 100 bootstrap samples) by
# sample size and measure type.
rel_df_sample_size_summary %>%
ggplot(aes(sample_size, sem_icc))+
geom_line(aes(group = dv, color=ddm_raw), alpha = 0.1)+
geom_line(data = tmp, aes(sample_size,sem_icc, color=ddm_raw))+
geom_point(data = tmp, aes(sample_size,sem_icc, color=ddm_raw))+
facet_wrap(~contrast)+
ylab("Standard error of mean of reliability \n of 100 samples of size n")+
xlab("Sample size")+
theme(legend.title = element_blank(),
legend.position = "bottom")
# Note: this model is fit on the per-dv summary data (one sem per
# dv x sample size), not on the raw bootstrap iterations.
summary(lmer(sem_icc ~ factor(sample_size) * contrast + (1|dv), rel_df_sample_size_summary))
Linear mixed model fit by REML ['lmerMod']
Formula: sem_icc ~ factor(sample_size) * contrast + (1 | dv)
Data: rel_df_sample_size_summary
REML criterion at convergence: -10328
Scaled residuals:
Min 1Q Median 3Q Max
-3.662 -0.327 0.004 0.292 11.654
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 0.0000301 0.00549
Residual 0.0000176 0.00419
Number of obs: 1365, groups: dv, 273
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.020030 0.000543 36.9
factor(sample_size)50 -0.008025 0.000466 -17.2
factor(sample_size)75 -0.011408 0.000466 -24.5
factor(sample_size)100 -0.013955 0.000466 -29.9
factor(sample_size)125 -0.016135 0.000466 -34.6
contrastcontrast 0.017341 0.000851 20.4
factor(sample_size)50:contrastcontrast -0.007552 0.000731 -10.3
factor(sample_size)75:contrastcontrast -0.010997 0.000731 -15.0
factor(sample_size)100:contrastcontrast -0.012866 0.000731 -17.6
factor(sample_size)125:contrastcontrast -0.014519 0.000731 -19.9
Correlation of Fixed Effects:
(Intr) fc(_)50 fc(_)75 fc(_)100 fc(_)125 cntrst f(_)50:
fctr(sm_)50 -0.429
fctr(sm_)75 -0.429 0.500
fctr(s_)100 -0.429 0.500 0.500
fctr(s_)125 -0.429 0.500 0.500 0.500
cntrstcntrs -0.638 0.274 0.274 0.274 0.274
fctr(s_)50: 0.274 -0.638 -0.319 -0.319 -0.319 -0.429
fctr(s_)75: 0.274 -0.319 -0.638 -0.319 -0.319 -0.429 0.500
fctr(_)100: 0.274 -0.319 -0.319 -0.638 -0.319 -0.429 0.500
fctr(_)125: 0.274 -0.319 -0.319 -0.319 -0.638 -0.429 0.500
f(_)75: f(_)100:
fctr(sm_)50
fctr(sm_)75
fctr(s_)100
fctr(s_)125
cntrstcntrs
fctr(s_)50:
fctr(s_)75:
fctr(_)100: 0.500
fctr(_)125: 0.500 0.500
Does between subjects variance change with sample size?
Yes. Between subjects variance decreases with sample size. This is more pronounced for non-contrast measures.
This goes against my intuitions. Looking at the change in between subjects percentage of individual measures’ there seems to be a lot of inter-measure variance (more pronounced below for within subject variance). I’m not sure if there is something in common for the measures that show increasing between subjects variability with sample size and that separates them from those that show decreasing between subjects variability with sample size (the slight majority).
# Percentage of between-subjects variance by sample size and measure type.
rel_df_sample_size_summary %>%
ggplot(aes(sample_size, mean_var_subs_pct))+
geom_line(aes(group = dv, color=ddm_raw), alpha = 0.1)+
geom_line(data = tmp, aes(sample_size,mean_var_subs_pct, color=ddm_raw))+
geom_point(data = tmp, aes(sample_size,mean_var_subs_pct, color=ddm_raw))+
facet_wrap(~contrast)+
ylab("Mean percentage of \n between subjects variance \n of 100 samples of size n")+
xlab("Sample size")+
theme(legend.title = element_blank(),
legend.position = "bottom")
# Sample size x contrast effects on % between-subjects variance, with
# random intercepts for dv and bootstrap iteration.
summary(lmer(var_subs_pct ~ factor(sample_size) * contrast + (1|dv) + (1|iteration), rel_df_sample_size))
Linear mixed model fit by REML ['lmerMod']
Formula: var_subs_pct ~ factor(sample_size) * contrast + (1 | dv) + (1 |
iteration)
Data: rel_df_sample_size
REML criterion at convergence: 1071657
Scaled residuals:
Min 1Q Median 3Q Max
-4.737 -0.672 0.055 0.682 4.888
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 134.62 11.602
iteration (Intercept) 0.65 0.806
Residual 148.33 12.179
Number of obs: 136500, groups: dv, 273; iteration, 100
Fixed effects:
Estimate Std. Error t value
(Intercept) 58.117 0.920 63.2
factor(sample_size)50 -2.441 0.135 -18.0
factor(sample_size)75 -4.261 0.135 -31.5
factor(sample_size)100 -6.056 0.135 -44.8
factor(sample_size)125 -7.715 0.135 -57.0
contrastcontrast -14.339 1.437 -10.0
factor(sample_size)50:contrastcontrast 1.756 0.212 8.3
factor(sample_size)75:contrastcontrast 3.260 0.212 15.4
factor(sample_size)100:contrastcontrast 5.018 0.212 23.6
factor(sample_size)125:contrastcontrast 6.650 0.212 31.3
Correlation of Fixed Effects:
(Intr) fc(_)50 fc(_)75 fc(_)100 fc(_)125 cntrst f(_)50:
fctr(sm_)50 -0.074
fctr(sm_)75 -0.074 0.500
fctr(s_)100 -0.074 0.500 0.500
fctr(s_)125 -0.074 0.500 0.500 0.500
cntrstcntrs -0.635 0.047 0.047 0.047 0.047
fctr(s_)50: 0.047 -0.638 -0.319 -0.319 -0.319 -0.074
fctr(s_)75: 0.047 -0.319 -0.638 -0.319 -0.319 -0.074 0.500
fctr(_)100: 0.047 -0.319 -0.319 -0.638 -0.319 -0.074 0.500
fctr(_)125: 0.047 -0.319 -0.319 -0.319 -0.638 -0.074 0.500
f(_)75: f(_)100:
fctr(sm_)50
fctr(sm_)75
fctr(s_)100
fctr(s_)125
cntrstcntrs
fctr(s_)50:
fctr(s_)75:
fctr(_)100: 0.500
fctr(_)125: 0.500 0.500
Does within subjects variance change with sample size?
Yes. This again goes against my intuition, but here the inter-measure differences are even more pronounced. There appear to be some measures for which the change between two measurements at different time points is larger the more subjects are tested, and others that show a smaller decrease in within-subjects variance with larger sample sizes. I still don’t know if these two types of measures have anything that distinguishes them.
# Within-subjects variance percentage vs. sample size: one faint line per DV,
# with the group-level summary (tmp) overlaid as solid lines and points,
# faceted by contrast vs. non-contrast measures.
rel_df_sample_size_summary %>%
  ggplot(aes(sample_size, mean_var_ind_pct)) +
  geom_line(aes(group = dv, color = ddm_raw), alpha = 0.1) +
  geom_line(data = tmp, aes(sample_size, mean_var_ind_pct, color = ddm_raw)) +
  geom_point(data = tmp, aes(sample_size, mean_var_ind_pct, color = ddm_raw)) +
  facet_wrap(~contrast) +
  labs(
    x = "Sample size",
    y = "Mean percentage of \n within subjects variance \n of 100 samples of size n"
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank()
  )
# Does the within-subjects variance percentage change with sample size and
# contrast status? Random intercepts account for repeated DVs and iterations.
ind_pct_model <- lmer(
  var_ind_pct ~ factor(sample_size) * contrast + (1 | dv) + (1 | iteration),
  data = rel_df_sample_size
)
summary(ind_pct_model)
Linear mixed model fit by REML ['lmerMod']
Formula: var_ind_pct ~ factor(sample_size) * contrast + (1 | dv) + (1 |
iteration)
Data: rel_df_sample_size
REML criterion at convergence: 1161673
Scaled residuals:
Min 1Q Median 3Q Max
-3.973 -0.718 -0.121 0.665 4.342
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 217.021 14.732
iteration (Intercept) 0.677 0.823
Residual 287.045 16.942
Number of obs: 136500, groups: dv, 273; iteration, 100
Fixed effects:
Estimate Std. Error t value
(Intercept) 20.167 1.168 17.3
factor(sample_size)50 3.178 0.188 16.9
factor(sample_size)75 5.627 0.188 29.9
factor(sample_size)100 8.057 0.188 42.8
factor(sample_size)125 10.250 0.188 54.4
contrastcontrast 3.862 1.827 2.1
factor(sample_size)50:contrastcontrast -2.043 0.295 -6.9
factor(sample_size)75:contrastcontrast -3.933 0.295 -13.3
factor(sample_size)100:contrastcontrast -6.321 0.295 -21.4
factor(sample_size)125:contrastcontrast -8.528 0.295 -28.9
Correlation of Fixed Effects:
(Intr) fc(_)50 fc(_)75 fc(_)100 fc(_)125 cntrst f(_)50:
fctr(sm_)50 -0.081
fctr(sm_)75 -0.081 0.500
fctr(s_)100 -0.081 0.500 0.500
fctr(s_)125 -0.081 0.500 0.500 0.500
cntrstcntrs -0.636 0.052 0.052 0.052 0.052
fctr(s_)50: 0.051 -0.638 -0.319 -0.319 -0.319 -0.081
fctr(s_)75: 0.051 -0.319 -0.638 -0.319 -0.319 -0.081 0.500
fctr(_)100: 0.051 -0.319 -0.319 -0.638 -0.319 -0.081 0.500
fctr(_)125: 0.051 -0.319 -0.319 -0.319 -0.638 -0.081 0.500
f(_)75: f(_)100:
fctr(sm_)50
fctr(sm_)75
fctr(s_)100
fctr(s_)125
cntrstcntrs
fctr(s_)50:
fctr(s_)75:
fctr(_)100: 0.500
fctr(_)125: 0.500 0.500
Does residual variance change with sample size?
# Residual variance percentage vs. sample size: one faint line per DV,
# with the group-level summary (tmp) overlaid as solid lines and points,
# faceted by contrast vs. non-contrast measures.
rel_df_sample_size_summary %>%
  ggplot(aes(sample_size, mean_var_resid_pct)) +
  geom_line(aes(group = dv, color = ddm_raw), alpha = 0.1) +
  geom_line(data = tmp, aes(sample_size, mean_var_resid_pct, color = ddm_raw)) +
  geom_point(data = tmp, aes(sample_size, mean_var_resid_pct, color = ddm_raw)) +
  facet_wrap(~contrast) +
  labs(
    x = "Sample size",
    y = "Mean percentage of residual variance \n of 100 samples of size n"
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank()
  )
# Does the residual variance percentage change with sample size and
# contrast status? Random intercepts account for repeated DVs and iterations.
resid_pct_model <- lmer(
  var_resid_pct ~ factor(sample_size) * contrast + (1 | dv) + (1 | iteration),
  data = rel_df_sample_size
)
summary(resid_pct_model)
Linear mixed model fit by REML ['lmerMod']
Formula: var_resid_pct ~ factor(sample_size) * contrast + (1 | dv) + (1 |
iteration)
Data: rel_df_sample_size
REML criterion at convergence: 940397
Scaled residuals:
Min 1Q Median 3Q Max
-5.248 -0.566 -0.007 0.570 6.472
Random effects:
Groups Name Variance Std.Dev.
dv (Intercept) 51.997 7.211
iteration (Intercept) 0.215 0.464
Residual 56.701 7.530
Number of obs: 136500, groups: dv, 273; iteration, 100
Fixed effects:
Estimate Std. Error t value
(Intercept) 21.7160 0.5715 38.0
factor(sample_size)50 -0.7372 0.0837 -8.8
factor(sample_size)75 -1.3662 0.0837 -16.3
factor(sample_size)100 -2.0010 0.0837 -23.9
factor(sample_size)125 -2.5352 0.0837 -30.3
contrastcontrast 10.4776 0.8933 11.7
factor(sample_size)50:contrastcontrast 0.2869 0.1312 2.2
factor(sample_size)75:contrastcontrast 0.6733 0.1312 5.1
factor(sample_size)100:contrastcontrast 1.3024 0.1312 9.9
factor(sample_size)125:contrastcontrast 1.8779 0.1312 14.3
Correlation of Fixed Effects:
(Intr) fc(_)50 fc(_)75 fc(_)100 fc(_)125 cntrst f(_)50:
fctr(sm_)50 -0.073
fctr(sm_)75 -0.073 0.500
fctr(s_)100 -0.073 0.500 0.500
fctr(s_)125 -0.073 0.500 0.500 0.500
cntrstcntrs -0.636 0.047 0.047 0.047 0.047
fctr(s_)50: 0.047 -0.638 -0.319 -0.319 -0.319 -0.073
fctr(s_)75: 0.047 -0.319 -0.638 -0.319 -0.319 -0.073 0.500
fctr(_)100: 0.047 -0.319 -0.319 -0.638 -0.319 -0.073 0.500
fctr(_)125: 0.047 -0.319 -0.319 -0.319 -0.638 -0.073 0.500
f(_)75: f(_)100:
fctr(sm_)50
fctr(sm_)75
fctr(s_)100
fctr(s_)125
cntrstcntrs
fctr(s_)50:
fctr(s_)75:
fctr(_)100: 0.500
fctr(_)125: 0.500 0.500
– Do DDM parameters capture similar processes as the raw measures in a given task or do they capture processes that are more similar across tasks? – Are these clusters more reliable than using either the raw or the DDM measures alone?
– Do raw or DDM measures (or factor scores) predict real world outcomes better?
Prof. Domingue recommended readings: file:///Users/zeynepenkavi/Downloads/10.1007%252Fs11336-006-1478-z.pdf A HIERARCHICAL FRAMEWORK FOR MODELING SPEED AND ACCURACY ON TEST ITEMS
file:///Users/zeynepenkavi/Downloads/v66i04.pdf Fitting Diffusion Item Response Theory Models for Responses and Response Times Using the R Package diffIRT